# General imports - here we import the libraries we will use in this notebook
import numpy as np  # NumPy, a fundamental package for scientific computing with Python.

import pandas as pd # Pandas, a library providing high-performance, easy-to-use data structures and data analysis tools for Python
from scipy import stats # SciPy, a Python-based ecosystem of open-source software for mathematics, science, and engineering.

import matplotlib.pyplot as plt # Matplotlib, a 2D plotting library which produces publication quality figures in a variety of formats
import seaborn as sns # Seaborn, a Python visualization library based on Matplotlib, provides a high-level interface for drawing attractive statistical graphics.

# General imports - here we import the libraries we will use in this notebook
import numpy as np  # NumPy, a fundamental package for scientific computing with Python.

import pandas as pd # Pandas, a library providing high-performance, easy-to-use data structures and data analysis tools for Python
from scipy import stats # SciPy, a Python-based ecosystem of open-source software for mathematics, science, and engineering.

import matplotlib.pyplot as plt # Matplotlib, a 2D plotting library which produces publication quality figures in a variety of formats
import seaborn as sns # Seaborn, a Python visualization library based on Matplotlib, provides a high-level interface for drawing attractive statistical graphics.

# Task 1 - Part 1
# Scaffold cell: read the data into `telomere_df` and draw three side-by-side
# distribution panels (gender, batch, age groups). The TODO markers below are
# the parts left to be completed.

# TODO: Read the data
telomere_df = ... # Hint: Use pd.read_csv() function (`...` is only a placeholder)

# TODO: Create age groups for visualization

# Create figure: a single row of three axes, one per TODO panel below
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 5))

# TODO: Gender distribution

# TODO: Batch distribution

# TODO: Age groups distribution


# Show subplots
plt.tight_layout()
plt.show()

# Task 1 - Part 1
# NOTE(review): this cell repeats the "Task 1 - Part 1" scaffold that already
# appears earlier in this file; it looks like an export duplication - confirm
# whether both copies are needed.
# Scaffold cell: read the data into `telomere_df` and draw three side-by-side
# distribution panels (gender, batch, age groups).

# TODO: Read the data
telomere_df = ... # Hint: Use pd.read_csv() function (`...` is only a placeholder)

# TODO: Create age groups for visualization

# Create figure: a single row of three axes, one per TODO panel below
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 5))

# TODO: Gender distribution

# TODO: Batch distribution

# TODO: Age groups distribution


# Show subplots
plt.tight_layout()
plt.show()

# Task 1 - Part 2
# Scaffold cell: three side-by-side box plots (gender, batch, age group), each
# to be annotated with statistics by the TODO sections below.

# Box plots with statistics: a single row of three axes
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 5))

# TODO: Gender boxplot with statistics


# TODO: Batch boxplot with statistics

# TODO: Age group boxplot with statistics


# Show subplots
plt.tight_layout()
plt.show()

# Task 1 - Part 3
# Scaffold cell: Q-Q plot built from stats.probplot() quantiles.

# Distribution analysis with pdf and Q-Q plot
# NOTE(review): only a single axes (ax1) is created here, so the pdf plot
# mentioned above would need its own axes - confirm the intended layout.
fig, ax1 = plt.subplots(1, 1, figsize=(5, 5))

# TODO: Extract quantiles using stats.probplot() function

# TODO: Plot them using plt.scatter

# Show plot
plt.tight_layout()
plt.show()

# TODO: Read the methylation data
methylation_df = ... # Hint: Use pd.read_csv() function (`...` is only a placeholder)

# Task 2 - Part 1
# Scaffold cell: methylation level vs age, one subplot per CpG site.

# First plot - CpG site comparison
# Create 4 subplots in a 2x2 grid; `axes` holds the grid of axes objects
fig, axes = plt.subplots(2, 2, figsize=(7, 7))


# TODO: Plot each methylation level vs age of each CpG site in its subplot


# Show subplots
plt.tight_layout()
plt.show()

# Task 2 - Part 2
# Scaffold cell: add a logit-transformed methylation column to
# `methylation_df`, then plot methylation vs age per CpG site on a 2x2 grid.

from scipy.special import logit

# TODO: Create new column with logit of methylation level
# `...` (Ellipsis) is a placeholder so that the file still parses before the
# exercise is completed; replace it with the transformed values.
methylation_df['Methylation_logit'] = ...  # Hint: Use apply() to the methylation column

# Create subplots: 2x2 grid, `axes` holds the grid of axes objects
fig, axes = plt.subplots(2, 2, figsize=(7, 7))

# TODO: Plot each methylation level vs age of each CpG site in its subplot

# Show subplots
plt.tight_layout()
plt.show()

# Task 2 - Part 3
# Scaffold cell: grid of heatmaps, one per (plate, CpG site) combination.

# Create a heatmap per plate (6) and per methylation site (4)
# Each heatmap represents a 96-well plate (8 x 12 grid - A-H/1-12)

# Create a 6x4 grid of subplots (rows x columns as passed to plt.subplots)
fig, axes = plt.subplots(6, 4, figsize=(12, 12))

# TODO: Create a heatmap for each plate and each CpG site

# Show subplots
plt.tight_layout()
plt.show()

# Task 3 - Part 1
# Scaffold cell: read the protein modification data, pick one protein and
# inspect the distribution of its counts with a histogram.

# TODO: Read protein modification data
protein_df = ...  # Hint: Use pd.read_csv() function (`...` is only a placeholder)

# TODO: Let's focus on H3K4me3 as an example protein
# `...` (Ellipsis) is a placeholder so that the file still parses before the
# exercise is completed; replace it with the protein name.
protein = ...  # Hint: "H3K4me3"

# Create figure for distribution inspection
fig, ax = plt.subplots(figsize=(10, 6))

# TODO: Create histogram of counts


plt.show()

# Task 3 - Part 2
# Scaffold cell: overlay a theoretical distribution on the histogram of counts.

# Create figure for distribution inspection
fig, ax1 = plt.subplots(figsize=(7, 7))

# TODO: Create (again) histogram of counts

# TODO: Get points of the theoretical distribution
# `x` and `y` are placeholders (`...`) until the TODO is completed.
x = ... # Hint: Check np.arange
y = ... # Hint: Use pmf() function from the distribution of your choice

# TODO: Plot the theoretical distribution (use ax1.plot)


plt.show()

# Task 3 - Part 3
# Scaffold cell: Q-Q plot to assess how well the chosen distribution fits.

# Single axes for the Q-Q plot; the figure handle is named `f` in this cell
f, ax = plt.subplots(figsize=(7, 7))
# TODO: Create Q-Q plot to assess fit

plt.show()

Hands-on 2: Data I/O and Distribution Analysis in Python

Recap Section

Problem Statement

Deliverables

IMPORTANT: Format Requirements

Summary of deliverables:

Task 1

Task 2

Task 3